Imports¶

In [3]:
import numpy as np

from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import r_regression, chi2, SelectKBest
from sklearn.preprocessing import MinMaxScaler

import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo

pyo.init_notebook_mode()

Code¶

Helper Functions¶

In [9]:
def is_number(num_str):
    """
    Determines whether a given string input represents
    a number.
    """
    try:
        float(num_str)
        return True
    except ValueError:
        return False

# Vectorised versions of the is_number check
vec_isalpha = np.vectorize(lambda x: not is_number(x))
vec_isnum = np.vectorize(is_number)

def plot_bar_data(*bars, x=None, title="", x_label="", y_label=""):
    """
    Generic function for creating a bar plot.
    """
    fig = go.Figure(
        layout={
            "title": title,
            "xaxis": {"title": x_label},
            "yaxis": {"title": y_label},
            "barmode": "group"
        }, data=[
            go.Bar(name=f"{bar[0]}", x=x, y=bar[1])
            for bar in bars
        ])
    
    return fig

DatasetManager Class¶

In [10]:
class DatasetManager():
    def __init__(self, ds_name):
        self._original_ds = None
        self._clean_ds = None
        self._encoded_ds = None
        self._encodings = None
        self._numerised_ds = None
        self._complete_ds = None
        self._feat_ds = None
        self._feat_cols = None
        self._scores = None
        self._scaled_feat_ds = None
        self._ds_name = ds_name
    
    def get_ds_name(self):
        return self._ds_name
    
    def get_original_ds(self):
        return self._original_ds
    
    def get_clean_ds(self):
        return self._clean_ds
    
    def get_encoded_ds(self):
        return self._encoded_ds
    
    def get_encodings(self):
        return self._encodings
    
    def get_numerised_ds(self):
        return self._numerised_ds
    
    def get_complete_ds(self):
        return self._complete_ds
    
    def get_feat_ds(self):
        return self._feat_ds
    
    def get_feat_ds_cols_and_scores(self):
        return self._feat_cols, self._scores
    
    def get_scaled_feat_ds(self):
        return self._scaled_feat_ds
    
    def load_and_preprocess(self, crit_cols, imp_choice):
        """
        Function for loading and cleaning dataset as well as
        encoding non-numerical values and imputing missing values.
        """
        # Load dataset
        self.load_dataset(self._ds_name)
        print("Dataset loaded...")

        # Clean dataset
        self.clean_dataset(crit_cols)
        print("Dataset cleaned..")

        # Encode dataset
        self.encode_dataset()
        print("Dataset encodings..")

        # Numerise dataset
        self.numerise_dataset()
        print("Dataset numerised...")

        # Impute missing values
        self.impute_dataset(imp_choice)
        print("Missing values imputed...")
    
    def load_dataset(self, ds_name):
        """
        Loads the dataset from csv file into a numpy array.
        """
        # Load dataset
        ds = np.genfromtxt(
            f"{ds_name}.csv", 
            delimiter=",",
            skip_header=1,
            dtype="str"
        )

        # Strip whitespace from start and end of all elements
        self._original_ds = np.char.strip(ds)

    def clean_dataset(self, check_cols=None):
        """
        Filters dataset by removing rows that have missing
        values in at least 1 of a specified set of columns.
        """
        ds = np.copy(self._original_ds)

        # Create a mask to find rows with missing values in critical columns
        if not check_cols:
            missing_rows_mask = np.any(ds == "", axis=1)
        else:
            missing_rows_mask = np.any(ds[:, check_cols] == "", axis=1)

        # Remove rows with missing values using boolean indexing
        self._clean_ds = ds[~missing_rows_mask]

    def encode_dataset(self):
        """
        Encodes non-numerical columns in the dataset.
        """
        self._encoded_ds = np.copy(self._clean_ds)
        self._encodings = {}

        # Get number of columns in dataset
        num_cols = self._encoded_ds.shape[1]

        # Loop through columns to see if any need to be encoded
        for i in range(num_cols):
            if np.all(vec_isalpha(self._encoded_ds[:, i])):
                # Initialise encoder
                oec = OrdinalEncoder(categories="auto", dtype=float)

                # Fit encoder
                oec.fit(self._encoded_ds[:, [i]])

                # Replace columns
                self._encoded_ds[:, i] = oec.transform(self._encoded_ds[:, [i]]).flatten()

                # Save category encoding
                self._encodings[i] = oec.categories_[0]
    
    def numerise_dataset(self):
        """
        Converts all elements in the dataset from 
        strings to numbers.
        """
        self._numerised_ds = np.where(
            self._encoded_ds == "", 
            np.nan, 
            self._encoded_ds
        ).astype(float)

    def impute_dataset(self, imp_choice="simple"):
        """
        Fills in missing values in the dataset using
        a specified imputation method.
        """
        complete_ds = np.copy(self._numerised_ds)

        # Initialise imputer
        if imp_choice == "simple":
            imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
        elif imp_choice == "knn":
            imputer = KNNImputer(n_neighbors=2)
        elif imp_choice == "iterative":
            imputer = IterativeImputer(random_state=0)

        # Fit imputer and fill missing values
        self._complete_ds = imputer.fit_transform(complete_ds)
    
    def create_feature_set(self, n_features):
        """
        Creates a feature set consisting of n features.
        Features are found using Pearson correlation.
        """
        ds = np.copy(self._complete_ds)
        
        # Selecting features and scores (r_regression gives signed Pearson
        # correlations, so SelectKBest keeps the k most positively correlated features)
        feat_selector = SelectKBest(r_regression, k=n_features)
        self._feat_ds = feat_selector.fit_transform(ds[:, :-1], ds[:, -1])
        self._scores = feat_selector.scores_
        self._feat_cols = np.sort(np.argsort(self._scores)[-n_features:])
    
    def scale_feature_set(self):
        """
        Scales the feature set.
        """
        feat_ds = np.copy(self._feat_ds)
        
        # Initialise scaler
        scaler = MinMaxScaler(feature_range=(-1, 1))
        
        # Apply scaler
        self._scaled_feat_ds = scaler.fit_transform(feat_ds)
        
    def test_feature_number(
        self,
        model_type="clf",
        test_size=0.2,
        chart_title="Model Accuracy vs Number of Features"
    ):
        """
        Creates a visualisation of the performance of a basic
        model that's trained on different feature sets created
        from the main dataset.
        """
        ds = np.copy(self._complete_ds)
        
        # Get number features and samples
        n_samples = min(10000, ds.shape[0])
        n_features = ds.shape[1]
        
        # Using pearson correlation to evaluate different feature sets
        x = []
        train_acc_vals = ("Training Accuracy", [])
        test_acc_vals = ("Testing Accuracy", [])
        avg_vals = ("Weighted Average Across Both Sets", [])
        
        for i in range(1, n_features):
            x.append(f"n = {i}")
            
            # Selecting features
            feat_selector = SelectKBest(r_regression, k=i)
            feat_ds = feat_selector.fit_transform(ds[:n_samples, :-1], ds[:n_samples, -1])
            
            # Splitting the dataset
            X_train, X_test, y_train, y_test = train_test_split(feat_ds, ds[:n_samples, -1], test_size=test_size)
            
            # Normalising training data
            scaler = MinMaxScaler(feature_range=(-1, 1))
            X_train_norm = scaler.fit_transform(X_train)
            
            # Model initialisation
            if model_type == "clf": # classification model
                model = SVC()
            elif model_type == "reg": # regression model
                model = SVR()
            
            # Train model using the training data
            model.fit(X_train_norm, y_train)
            
            # Get training accuracy and add it to list
            train_acc = model.score(X_train_norm, y_train)
            train_acc_vals[1].append(train_acc)
            
            # Get testing accuracy and add it to list
            X_test_norm = scaler.transform(X_test)
            test_acc = model.score(X_test_norm, y_test)
            test_acc_vals[1].append(test_acc)
            
            avg_vals[1].append(np.average([train_acc, test_acc], weights=[1-test_size,test_size]))
            
        # Get averages across all feature numbers
        x.append("Average")
        train_acc_vals[1].append(np.mean(train_acc_vals[1]))
        test_acc_vals[1].append(np.mean(test_acc_vals[1]))
        avg_vals[1].append(np.mean(avg_vals[1]))
        
        # Create visualisation of model performance as function of feature number
        performance_data = plot_bar_data(
            train_acc_vals, 
            test_acc_vals,
            avg_vals,
            x=x,
            title=chart_title,
            x_label="Number of Features (n)",
            y_label="Model Accuracy"
        )
        
        return performance_data

Creating Dataset Managers¶

In [11]:
# GWP Dataset
gwp_dsm = DatasetManager("gwp_assessment")

# Star Dataset
star_dsm = DatasetManager("star_assessment")

Evaluating Imputation Methods¶

A basic model is trained on feature sets of increasing size to assess how each imputation method affects accuracy.
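
As a minimal sketch (a toy array, not part of the assessment pipeline), the cell below shows how the three imputers fill the same missing entry in different ways.

In [ ]:
# Toy illustration of the three imputation strategies on a single missing value
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

toy = np.array([[1.0, 2.0],
                [3.0, 4.0],
                [5.0, np.nan],
                [7.0, 8.0]])

# Simple: replaces the NaN with the column mean
print(SimpleImputer(strategy="mean").fit_transform(toy))

# KNN: replaces the NaN with the mean of the 2 nearest rows (by the observed feature)
print(KNNImputer(n_neighbors=2).fit_transform(toy))

# Iterative: models the missing column from the other column and predicts the NaN
print(IterativeImputer(random_state=0).fit_transform(toy))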

Simple Imputation¶

In [12]:
# Loading and preprocessing
gwp_dsm.load_and_preprocess([0,1,2,3], "simple") # rows with missing values in columns 0,1,2 or 3 will be deleted.
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "simple") # rows with missing values in columns 0,1,8,9,12,16,17 will be deleted
Dataset loaded...
Dataset cleaned...
Dataset encoded...
Dataset numerised...
Missing values imputed...
Dataset loaded...
Dataset cleaned...
Dataset encoded...
Dataset numerised...
Missing values imputed...
In [6]:
# Performance data for GWP dataset
chart_title = "Model Accuracy vs Number of Features (Simple Imputation Method)"
gwp_dsm.test_feature_number("reg", chart_title=chart_title)
In [7]:
# Performance data for Star dataset
chart_title = "Model Accuracy vs Number of Features (Simple Imputation Method)"
star_dsm.test_feature_number("clf", chart_title=chart_title)

KNN Imputation¶

In [8]:
# Loading and preprocessing
gwp_dsm.load_and_preprocess([0,1,2,3], "knn") # rows with missing values in columns 0,1,2 or 3 will be deleted.
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "knn") # rows with missing values in columns 0,1,8,9,12,16,17 will be deleted
Dataset loaded...
Dataset cleaned...
Dataset encoded...
Dataset numerised...
Missing values imputed...
Dataset loaded...
Dataset cleaned...
Dataset encoded...
Dataset numerised...
Missing values imputed...
In [9]:
# Performance data for GWP dataset
chart_title = "Model Accuracy vs Number of Features (KNN Imputation Method)"
gwp_dsm.test_feature_number("reg", chart_title=chart_title)
In [10]:
# Performance data for Star dataset
chart_title = "Model Accuracy vs Number of Features (KNN Imputation Method)"
star_dsm.test_feature_number("clf", chart_title=chart_title)

Iterative Imputation¶

In [11]:
# Loading and preprocessing
gwp_dsm.load_and_preprocess([0,1,2,3], "iterative") # rows with missing values in columns 0,1,2 or 3 will be deleted.
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "iterative") # rows with missing values in columns 0,1,8,9,12,16,17 will be deleted
Dataset loaded...
Dataset cleaned...
Dataset encoded...
Dataset numerised...
Missing values imputed...
Dataset loaded...
Dataset cleaned...
Dataset encoded...
Dataset numerised...
Missing values imputed...
In [12]:
# Performance data for GWP dataset
chart_title = "Model Accuracy vs Number of Features (Iterative Imputation Method)"
gwp_dsm.test_feature_number("reg", chart_title=chart_title)
In [13]:
# Performance data for Star dataset
chart_title = "Model Accuracy vs Number of Features (Iterative Imputation Method)"
star_dsm.test_feature_number("clf", chart_title=chart_title)

Analysis¶

GWP Dataset

The simple imputation method yields the highest average training accuracy, but the iterative method gives a higher average testing accuracy and a higher weighted average of the two. The iterative method is therefore the optimal choice for the GWP dataset.

Star Dataset

The KNN imputation method has a slight edge over the other two methods in both average training accuracy and the weighted average of training and testing accuracy. The KNN method is therefore the optimal choice for the Star dataset.

Getting Final Feature Sets¶

In [14]:
# Loading and preprocessing with optimal imputation methods
gwp_dsm.load_and_preprocess([0,1,2,3], "iterative") # rows with missing values in columns 0,1,2 or 3 will be deleted.
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "knn") # rows with missing values in columns 0,1,8,9,12,16,17 will be deleted
Dataset loaded...
Dataset cleaned...
Dataset encoded...
Dataset numerised...
Missing values imputed...
Dataset loaded...
Dataset cleaned...
Dataset encoded...
Dataset numerised...
Missing values imputed...

Finding Ideal Number of Features¶

In [19]:
# Performance data for GWP dataset
gwp_dsm.test_feature_number("reg")
In [16]:
# Performance data for Star dataset
star_dsm.test_feature_number("clf")

Analysis¶

GWP Dataset

Training and testing accuracy steadily increase until n = 7. Beyond this point the two metrics begin to diverge, which suggests that any feature set with more than 7 features would quickly lead to overfitting. Therefore, the optimal number of features is 7.

Star Dataset

Training and testing accuracy increase until n = 8. At this point both accuracy metrics decrease until n = 11, where there is a sudden spike; this would suggest that overfitting sets in at n = 11. Therefore, the optimal number of features is 8.
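
The cut-off points above were read off the charts by eye. As a rough sketch, a small helper like the hypothetical pick_n_features below could automate the same reasoning by flagging where training and testing accuracy start to diverge (the 0.05 gap threshold and the accuracy values are illustrative assumptions, not results from the charts).

In [ ]:
import numpy as np

def pick_n_features(train_acc, test_acc, max_gap=0.05):
    """Return the largest n whose train/test accuracy gap stays within max_gap."""
    gaps = np.abs(np.asarray(train_acc) - np.asarray(test_acc))
    within = np.where(gaps <= max_gap)[0]
    return int(within[-1]) + 1 if within.size else 1

# Illustrative accuracy curves (not real results)
train = [0.60, 0.65, 0.70, 0.74, 0.78, 0.81, 0.84, 0.90, 0.93]
test  = [0.58, 0.64, 0.69, 0.73, 0.76, 0.79, 0.81, 0.80, 0.78]
print(pick_n_features(train, test))  # 7 with these made-up numbers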

Creating Optimal Feature Sets¶

In [20]:
# Creating feature sets of the optimal size for each dataset
gwp_dsm.create_feature_set(7)
star_dsm.create_feature_set(8)

Scaling¶

In [21]:
# Scaling each feature set
gwp_dsm.scale_feature_set()
star_dsm.scale_feature_set()

GWP Dataset¶

In [22]:
# Final scaled GWP dataset
gwp_dsm.get_scaled_feat_ds()
Out[22]:
array([[-1.        ,  0.2       ,  1.        , ..., -0.06349206,
        -0.94555556,  0.31034483],
       [-1.        ,  0.2       ,  0.8630137 , ..., -0.87301587,
        -1.        , -0.86206897],
       [-1.        ,  0.2       ,  1.        , ..., -0.51587302,
        -0.97222222, -0.34482759],
       ...,
       [ 0.75862069,  1.        ,  0.7260274 , ..., -0.87301587,
        -1.        , -0.86206897],
       [ 0.75862069,  1.        ,  0.8630137 , ..., -0.76190476,
        -1.        , -0.70114943],
       [ 0.75862069,  1.        ,  0.7260274 , ..., -0.9047619 ,
        -1.        , -0.90804598]])
In [23]:
# Feature columns for scaled GWP dataset
feat_cols, _ = gwp_dsm.get_feat_ds_cols_and_scores()
feat_cols
Out[23]:
array([ 0,  3,  5,  7,  8,  9, 13])

Star Dataset¶

In [24]:
# Final scaled Star dataset
star_dsm.get_scaled_feat_ds()
Out[24]:
array([[ 0.00760492, -0.14467077,  0.99788799, ..., -0.09682969,
        -0.09681622,  0.29601311],
       [-0.01637576,  0.03197251,  0.9984505 , ...,  0.65766595,
         0.6576826 ,  0.78864009],
       [ 0.06827764, -0.12854131,  0.99791881, ..., -0.29810758,
        -0.29810276,  0.08793009],
       ...,
       [-0.32238054, -0.2748241 ,  0.9976005 , ..., -0.59320128,
        -0.59319274, -0.20070999],
       [ 0.28594879, -0.15289164,  0.99785431, ...,  0.05609863,
         0.05610292,  0.29983616],
       [ 0.34105031, -0.07022863,  0.99798512, ...,  0.16343213,
         0.16342317,  0.50081922]])
In [25]:
# Feature columns for scaled Star dataset
feat_cols, _ = star_dsm.get_feat_ds_cols_and_scores()
feat_cols
Out[25]:
array([ 2,  6,  7,  9, 10, 12, 14, 15])

Markdown Answer¶

The Pearson correlation coefficient measures the strength and direction of the linear relationship between two variables. For feature selection it serves two purposes: it identifies relevant features by quantifying each feature's linear relationship with the target variable, and it detects redundancy by comparing pairwise correlations between features. Features that correlate strongly with the target carry the most predictive power, while highly correlated feature pairs can be pruned to reduce the risk of multicollinearity. Its key benefits are simplicity and ease of interpretation: the coefficients are cheap to compute and allow features to be compared and ranked directly, which yields leaner, more efficient models that use fewer features without sacrificing much information. This combination of effectiveness and interpretability makes Pearson correlation a popular choice for feature selection among machine learning practitioners.
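
For reference, the coefficient between two variables X and Y is r = cov(X, Y) / (σ_X σ_Y). The cell below is a minimal sketch of the two uses described above, on illustrative data only: r_regression scores relevance against the target, and the pairwise correlation matrix exposes redundancy between features.

In [ ]:
import numpy as np
from sklearn.feature_selection import r_regression

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
X[:, 1] = X[:, 0] + rng.normal(scale=0.1, size=200)   # feature 1 nearly duplicates feature 0
y = 2 * X[:, 0] - X[:, 2] + rng.normal(scale=0.5, size=200)

# Relevance: signed Pearson correlation of each feature with the target
print(r_regression(X, y))

# Redundancy: off-diagonal values near ±1 flag correlated (redundant) feature pairs
print(np.corrcoef(X, rowvar=False))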

In [ ]: